{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install faker\n",
    "# Pinned below 1.0: this notebook calls openai.ChatCompletion, which openai>=1.0 removed\n",
    "%pip install \"openai<1\"\n",
    "%pip install elasticsearch\n",
    "# The PyPI distribution is named scikit-learn; 'sklearn' is a deprecated placeholder\n",
    "%pip install scikit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Log generation functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "from datetime import timezone\n",
    "\n",
    "import openai\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "from faker import Faker\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Shared Faker instance; the per-format log generators below draw their field values from it\n",
    "fake = Faker()\n",
    "\n",
    "# 1. Apache HTTP Server (Common Log Format)\n",
    "def generate_apache_log():\n",
    "    \"\"\"Return one fake Apache access-log line in Common Log Format.\n",
    "\n",
    "    Layout: host - - [timestamp] \"method uri protocol\" status size.\n",
    "    Host, HTTP method and URI come from Faker; the status code is picked from\n",
    "    200/404/500 and the response size from the range 100..10000.\n",
    "\n",
    "    The timestamp is requested as a timezone-aware UTC datetime because\n",
    "    strftime renders %z as an empty string for naive datetimes, which would\n",
    "    leave a dangling space instead of the UTC offset inside the brackets.\n",
    "\n",
    "    Returns:\n",
    "        str: the formatted log line.\n",
    "    \"\"\"\n",
    "    return '{RemoteHost} - - [{Timestamp}] \"{RequestMethod} {RequestURI} {Protocol}\" {StatusCode} {ResponseSize}'.format(\n",
    "        RemoteHost=fake.ipv4(),\n",
    "        # tzinfo makes the datetime aware so %z prints '+0000'\n",
    "        Timestamp=fake.date_time_this_year(tzinfo=timezone.utc).strftime('%d/%b/%Y:%H:%M:%S %z'),\n",
    "        RequestMethod=fake.http_method(),\n",
    "        RequestURI=fake.uri(),\n",
    "        Protocol='HTTP/1.1',\n",
    "        StatusCode=random.choice([200, 404, 500]),\n",
    "        ResponseSize=random.randint(100, 10000)\n",
    "    )\n",
    "\n",
    "# 2. Nginx (Combined Log Format)\n",
    "def generate_nginx_log():\n",
    "    \"\"\"Return one fake Nginx access-log line in Combined Log Format.\n",
    "\n",
    "    Layout: addr - user [timestamp] \"method uri protocol\" status size\n",
    "    \"referer\" \"user-agent\". The remote user is always '-'; address, method,\n",
    "    URIs and user agent come from Faker; the status code is picked from\n",
    "    200/404/500 and the response size from the range 100..10000.\n",
    "\n",
    "    The timestamp is requested as a timezone-aware UTC datetime because\n",
    "    strftime renders %z as an empty string for naive datetimes, which would\n",
    "    leave a dangling space instead of the UTC offset inside the brackets.\n",
    "\n",
    "    Returns:\n",
    "        str: the formatted log line.\n",
    "    \"\"\"\n",
    "    return '{RemoteAddress} - {RemoteUser} [{Timestamp}] \"{RequestMethod} {RequestURI} {Protocol}\" {StatusCode} {ResponseSize} \"{Referer}\" \"{UserAgent}\"'.format(\n",
    "        RemoteAddress=fake.ipv4(),\n",
    "        RemoteUser='-',\n",
    "        # tzinfo makes the datetime aware so %z prints '+0000'\n",
    "        Timestamp=fake.date_time_this_year(tzinfo=timezone.utc).strftime('%d/%b/%Y:%H:%M:%S %z'),\n",
    "        RequestMethod=fake.http_method(),\n",
    "        RequestURI=fake.uri(),\n",
    "        Protocol='HTTP/1.1',\n",
    "        StatusCode=random.choice([200, 404, 500]),\n",
    "        ResponseSize=random.randint(100, 10000),\n",
    "        Referer=fake.uri(),\n",
    "        UserAgent=fake.user_agent()\n",
    "    )\n",
    "\n",
    "# 3. Syslog (RFC 5424)\n",
    "def generate_syslog():\n",
    "    \"\"\"Return one fake syslog line laid out as\n",
    "    <priority>version timestamp hostname app procid msgid structured-data message.\n",
    "\n",
    "    The priority is drawn from 1..191, the version is always 1 and the\n",
    "    structured-data slot is always '-'. Process and message ids are random\n",
    "    four-digit integers; hostname, app name, timestamp and message come from Faker.\n",
    "\n",
    "    Returns:\n",
    "        str: the formatted syslog line.\n",
    "    \"\"\"\n",
    "    line_template = '<{Priority}>{Version} {Timestamp} {Hostname} {AppName} {ProcID} {MsgID} {StructuredData} {Message}'\n",
    "    # Entries are evaluated top to bottom, matching the field order of the template\n",
    "    field_values = {\n",
    "        'Priority': random.randint(1, 191),\n",
    "        'Version': 1,\n",
    "        'Timestamp': fake.date_time_this_year().isoformat(),\n",
    "        'Hostname': fake.hostname(),\n",
    "        'AppName': fake.word(),\n",
    "        'ProcID': random.randint(1000, 9999),\n",
    "        'MsgID': random.randint(1000, 9999),\n",
    "        'StructuredData': '-',\n",
    "        'Message': fake.sentence(),\n",
    "    }\n",
    "    return line_template.format(**field_values)\n",
    "\n",
    "# 4. AWS CloudTrail\n",
    "def generate_aws_cloudtrail_log():\n",
    "    \"\"\"Return one fake AWS CloudTrail record serialized as a JSON string.\n",
    "\n",
    "    The record always describes an IAMUser calling s3.amazonaws.com GetObject\n",
    "    in us-east-1; user name, timestamp, source IP, user agent, request/event\n",
    "    ids and the recipient account id are generated with Faker.\n",
    "\n",
    "    The record is built as a dict and serialized with json.dumps so that any\n",
    "    quote or backslash inside a generated value is escaped and the result is\n",
    "    always valid JSON. Key order and separators match the previous\n",
    "    hand-written template.\n",
    "\n",
    "    Returns:\n",
    "        str: the JSON document.\n",
    "    \"\"\"\n",
    "    record = {\n",
    "        'eventVersion': '1.08',\n",
    "        'userIdentity': {'type': 'IAMUser', 'userName': fake.user_name()},\n",
    "        'eventTime': fake.date_time_this_year().isoformat(),\n",
    "        'eventSource': 's3.amazonaws.com',\n",
    "        'eventName': 'GetObject',\n",
    "        'awsRegion': 'us-east-1',\n",
    "        'sourceIPAddress': fake.ipv4(),\n",
    "        'userAgent': fake.user_agent(),\n",
    "        'requestParameters': {'key': 'value'},\n",
    "        'responseElements': {'key': 'value'},\n",
    "        'requestID': str(fake.uuid4()),\n",
    "        'eventID': str(fake.uuid4()),\n",
    "        'eventType': 'AwsApiCall',\n",
    "        # fix_len=True forces exactly 12 digits; without it shorter numbers are possible\n",
    "        'recipientAccountId': str(fake.random_number(digits=12, fix_len=True)),\n",
    "    }\n",
    "    return json.dumps(record)\n",
    "\n",
    "# 5. Microsoft Windows Event Log\n",
    "def generate_windows_event_log():\n",
    "    \"\"\"Return one fake Windows event rendered as a single-line XML string.\n",
    "\n",
    "    The <System> element carries provider name, event id (1000..9999), level\n",
    "    (1..5), creation time, source name and computer; <EventData> holds a\n",
    "    Faker-generated sentence. Values are inserted as-is, without XML escaping.\n",
    "\n",
    "    Returns:\n",
    "        str: the XML document.\n",
    "    \"\"\"\n",
    "    event_template = '<Event xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"><System><Provider Name=\"{ProviderName}\"/><EventID>{EventID}</EventID><Level>{Level}</Level><TimeCreated SystemTime=\"{Timestamp}\"/><SourceName>{SourceName}</SourceName><Computer>{Computer}</Computer></System><EventData>{Message}</EventData></Event>'\n",
    "    # Entries are evaluated top to bottom, so the generators run in template order\n",
    "    event_fields = {\n",
    "        'ProviderName': fake.word(),\n",
    "        'EventID': random.randint(1000, 9999),\n",
    "        'Level': random.randint(1, 5),\n",
    "        'Timestamp': fake.date_time_this_year().isoformat(),\n",
    "        'SourceName': fake.word(),\n",
    "        'Computer': fake.hostname(),\n",
    "        'Message': fake.sentence(),\n",
    "    }\n",
    "    return event_template.format(**event_fields)\n",
    "\n",
    "# 6. Linux Audit Log\n",
    "def generate_linux_audit_log():\n",
    "    \"\"\"Return one fake audit line of the form 'type=<word> msg=audit(<timestamp>): <sentence>'.\n",
    "\n",
    "    The type is a random Faker word, the timestamp an ISO-8601 datetime from\n",
    "    the current year and the message a random Faker sentence.\n",
    "\n",
    "    Returns:\n",
    "        str: the formatted audit line.\n",
    "    \"\"\"\n",
    "    audit_type = fake.word()\n",
    "    stamp = fake.date_time_this_year().isoformat()\n",
    "    text = fake.sentence()\n",
    "    return 'type={AuditType} msg=audit({Timestamp}): {Message}'.format(\n",
    "        Message=text,\n",
    "        Timestamp=stamp,\n",
    "        AuditType=audit_type,\n",
    "    )\n",
    "\n",
    "def generate_logs(sources, total_logs, random_logs):\n",
    "    \"\"\"Generate fake log lines, splitting ``total_logs`` across ``sources``.\n",
    "\n",
    "    Args:\n",
    "        sources: Iterable of source names. Each must be one of 'apache',\n",
    "            'nginx', 'syslog', 'aws_cloudtrail', 'windows_event', 'linux_audit'.\n",
    "        total_logs: Total number of log lines to produce.\n",
    "        random_logs: When truthy, the per-source shares are shuffled so that\n",
    "            which source receives the larger share is random.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: the generated lines, grouped by source in the order given.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``sources`` is empty.\n",
    "        KeyError: if a source name is not recognized.\n",
    "    \"\"\"\n",
    "    # Mapping source names to their corresponding log generation functions\n",
    "    source_to_function = {\n",
    "        'apache': generate_apache_log,\n",
    "        'nginx': generate_nginx_log,\n",
    "        'syslog': generate_syslog,\n",
    "        'aws_cloudtrail': generate_aws_cloudtrail_log,\n",
    "        'windows_event': generate_windows_event_log,\n",
    "        'linux_audit': generate_linux_audit_log,\n",
    "    }\n",
    "\n",
    "    sources = list(sources)\n",
    "    if not sources:\n",
    "        raise ValueError('sources must name at least one log source')\n",
    "    # Resolve every generator up front so an unknown name fails before any log is produced\n",
    "    log_functions = [source_to_function[source] for source in sources]\n",
    "\n",
    "    # Split as evenly as possible; the first `remainder` shares get one extra log so the\n",
    "    # shares always add up to total_logs (the remainder is no longer dropped when\n",
    "    # random_logs is falsy)\n",
    "    base, remainder = divmod(total_logs, len(sources))\n",
    "    logs_per_source = [base + 1 if position < remainder else base\n",
    "                       for position in range(len(sources))]\n",
    "    if random_logs:\n",
    "        random.shuffle(logs_per_source)\n",
    "\n",
    "    # Generate the logs source by source\n",
    "    generated_logs = []\n",
    "    for log_function, num_logs in zip(log_functions, logs_per_source):\n",
    "        generated_logs.extend(log_function() for _ in range(num_logs))\n",
    "\n",
    "    return generated_logs\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Log Expansion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage: generate a small batch of fake logs\n",
    "sources_to_use = ['apache']\n",
    "total_logs_to_generate = 15\n",
    "random_logs_per_source = True\n",
    "logs = generate_logs(sources_to_use, total_logs_to_generate, random_logs_per_source)\n",
    "\n",
    "print(\"Logs: \")\n",
    "print(logs)\n",
    "\n",
    "# The whole batch is sent as one JSON array so the model can answer with one array\n",
    "stringifiedPromptsArray = json.dumps(logs)\n",
    "\n",
    "batchInstruction = {\n",
    "    \"role\": \"system\",\n",
    "    \"content\": \"Explain what happened for each log line of the array. Return a python array of the explanation. Only the array, no text around it or any extra comment, nothing else than the array should be in the answer. Don't forget in your completion to give the day, date and year of the log. Interpret some of the log content if you can, for example you have to translate what an error code 500.\"\n",
    "}\n",
    "\n",
    "# System instruction first, then the user payload (the conventional chat ordering)\n",
    "prompts = [\n",
    "    batchInstruction,\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": stringifiedPromptsArray\n",
    "    }\n",
    "]\n",
    "\n",
    "# Prefer the OPENAI_API_KEY environment variable over a secret stored in the notebook;\n",
    "# the literal placeholder is kept as the fallback so the cell behaves as before without it\n",
    "openai_api_key = os.environ.get(\"OPENAI_API_KEY\", \"OPENAI_API_KEY\")\n",
    "\n",
    "# Initialize the OpenAI API client\n",
    "openai.api_key = openai_api_key\n",
    "\n",
    "stringifiedBatchCompletion = openai.ChatCompletion.create(model=\"gpt-3.5-turbo\",\n",
    "                                         messages=prompts,\n",
    "                                         max_tokens=1000)\n",
    "print(\"ChatGPT: \")\n",
    "print(stringifiedBatchCompletion.choices[0].message.content)\n",
    "\n",
    "# literal_eval only accepts Python literals, so nothing in the model's reply is executed\n",
    "batchCompletion = ast.literal_eval(stringifiedBatchCompletion.choices[0].message.content)\n",
    "\n",
    "# Explanations are paired with logs by position in the indexing step, so fail loudly here\n",
    "# if the model did not return exactly one explanation per log\n",
    "if not isinstance(batchCompletion, (list, tuple)) or len(batchCompletion) != len(logs):\n",
    "    raise ValueError(\"Expected a list of {} explanations, got: {!r}\".format(len(logs), batchCompletion))\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Log Vectorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Initialize the Elasticsearch client (replace the placeholders with real connection details)\n",
    "es = Elasticsearch(\n",
    "    ['ELASTIC_CLUSTER_HOSTNAME:ELASTIC_CLUSTER_PORT'],\n",
    "    basic_auth=('ELASTIC_USERNAME', 'ELASTIC_PASSWORD'),\n",
    "    # NOTE(review): TLS certificate verification is switched off here\n",
    "    verify_certs=False\n",
    ")\n",
    "\n",
    "# Define the index configuration: a 768-dimensional dense vector compared by cosine similarity\n",
    "index_config = {\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"description_vectorized\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": 768,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "# Create the index only when it is missing, so re-running this cell does not fail\n",
    "# because the index already exists\n",
    "if not es.indices.exists(index='logs'):\n",
    "    response = es.indices.create(index='logs', mappings=index_config[\"mappings\"])\n",
    "\n",
    "# Build one bulk action per explanation, paired with the raw log at the same position.\n",
    "# Each action names the 'vectorize-log' ingest pipeline, which must already exist on the\n",
    "# cluster (presumably it fills description_vectorized from text_field; verify)\n",
    "bulk_index_body = [\n",
    "    {\n",
    "        \"_index\": \"logs\",\n",
    "        \"pipeline\": \"vectorize-log\",\n",
    "        \"_source\": {\n",
    "            \"text_field\": explanation, \"log\": raw_log\n",
    "        }\n",
    "    }\n",
    "    for explanation, raw_log in zip(batchCompletion, logs)\n",
    "]\n",
    "\n",
    "# Show the actions that are about to be sent\n",
    "print(\"Bulk request: \")\n",
    "print(bulk_index_body)\n",
    "\n",
    "try:\n",
    "    response = helpers.bulk(es, bulk_index_body)\n",
    "    print (\"\\nRESPONSE:\", response)\n",
    "except Exception as e:\n",
    "    print(\"\\nERROR:\", e)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Semantic Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Search the 'logs' index and return the explanation text of the best match\n",
    "def ESSearch(query_text):\n",
    "    \"\"\"Return the stored 'text_field' value that best matches ``query_text``.\n",
    "\n",
    "    Sends a search to the 'logs' index that combines a filter on documents\n",
    "    having a 'description_vectorized' field with a kNN clause whose query\n",
    "    vector is built from ``query_text`` by the text-embedding model named in\n",
    "    the request. The match is printed as well as returned.\n",
    "\n",
    "    Args:\n",
    "        query_text: Natural-language question to search for.\n",
    "\n",
    "    Returns:\n",
    "        The first 'text_field' value of the top hit, or None when the search\n",
    "        returns no hits.\n",
    "    \"\"\"\n",
    "    # Query part: only matches documents that carry the vector field\n",
    "    query = {\n",
    "        \"bool\": {\n",
    "            \"filter\": [{\n",
    "                \"exists\": {\n",
    "                    \"field\": \"description_vectorized\"\n",
    "                }\n",
    "            }]\n",
    "        }\n",
    "    }\n",
    "\n",
    "    # kNN part: nearest neighbour over the vector field, embedding the query text on the fly\n",
    "    knn = {\n",
    "        \"field\": \"description_vectorized\",\n",
    "        \"k\": 1,\n",
    "        \"num_candidates\": 20,\n",
    "        \"query_vector_builder\": {\n",
    "            \"text_embedding\": {\n",
    "                \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "                \"model_text\": query_text\n",
    "            }\n",
    "        },\n",
    "        \"boost\": 24\n",
    "    }\n",
    "\n",
    "    fields = [\"text_field\"]\n",
    "    index = 'logs'\n",
    "    resp = es.search(index=index,\n",
    "                     query=query,\n",
    "                     knn=knn,\n",
    "                     fields=fields,\n",
    "                     size=1,\n",
    "                     source=False)\n",
    "\n",
    "    hits = resp['hits']['hits']\n",
    "    if not hits:\n",
    "        # Nothing matched: report it instead of failing on hits[0]\n",
    "        print(\"No matching log found\")\n",
    "        return None\n",
    "\n",
    "    best_match = hits[0]['fields']['text_field'][0]\n",
    "    print(best_match)\n",
    "    return best_match\n",
    "\n",
    "\n",
    "# Example: ask a natural-language question against the indexed log explanations\n",
    "ESSearch(\"Were there any error in March?\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
